# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
# Load the raw PISA 2012 export, decoding the file as latin-1.
df = pd.read_csv(
    'pisa2012.csv',
    encoding='latin-1',
)
# Peek at the first rows, then at the (rows, columns) size.
df.head()
df.shape
Because the dataset has such a large number of features, I will limit the analysis to a subset of them.
For my analysis, I want to focus on how gender relates not only to students' mathematics scores but also to their attitudes towards mathematics. I also want to examine whether parents' attitudes towards their children's mathematics differ by the child's gender. For these reasons, I have decided to focus on the following features:
Student Information
Student Mathematics Scores
Student's Attitude Towards Mathematics
Math Anxiety
Math Self-Concept
Math Interest
Math Work Ethic
Math Behaviour
Subjective Norms
# Survey columns organised by the theme they measure. Each list holds raw
# column codes; the comment above a list gives the wording for each code.

# CNT: country, ST04Q01: gender
student_info = ['CNT', 'ST04Q01']

# PV1MATH: Overall Math Score
scores = ['PV1MATH']

# ST29Q02: Worthwhile for Work, ST29Q05: Worthwhile for Career Chances,
# ST29Q07: Important for Future Study, ST29Q08: Helps to Get a Job
motivation = ['ST29Q02', 'ST29Q05', 'ST29Q07', 'ST29Q08']

# ST42Q01: Worry That It Will Be Difficult, ST42Q03: Get Very Tense,
# ST42Q05: Get Very Nervous, ST42Q08: Feel Helpless,
# ST42Q10: Worry About Getting Poor Grades
anxiety = ['ST42Q01', 'ST42Q03', 'ST42Q05', 'ST42Q08', 'ST42Q10']

# ST42Q02: Not Good at Maths, ST42Q04: Get Good Grades, ST42Q06: Learn Quickly,
# ST42Q07: One of Best Subjects, ST42Q09: Understand Difficult Work
# NOTE: the list is named `self` because other cells refer to it by that name.
self = ['ST42Q02', 'ST42Q04', 'ST42Q06', 'ST42Q07', 'ST42Q09']

# ST29Q01: Enjoy Reading, ST29Q03: Look Forward to Lessons,
# ST29Q04: Enjoy Maths, ST29Q06: Interested
interest = ['ST29Q01', 'ST29Q03', 'ST29Q04', 'ST29Q06']

# ST46Q01: Homework Completed in Time, ST46Q02: Work Hard on Homework,
# ST46Q03: Prepared for Exams, ST46Q04: Study Hard for Quizzes,
# ST46Q05: Study Until I Understand Everything,
# ST46Q06: Pay Attention in Classes, ST46Q07: Listen in Classes,
# ST46Q08: Avoid Distractions When Studying, ST46Q09: Keep Work Organized
work_ethic = [
    'ST46Q01', 'ST46Q02', 'ST46Q03', 'ST46Q04', 'ST46Q05',
    'ST46Q06', 'ST46Q07', 'ST46Q08', 'ST46Q09',
]

# ST49Q01: Talk about Maths with Friends, ST49Q02: Help Friends with Maths,
# ST49Q03: (Extracurricular) Activity, ST49Q04: Participate in Competitions,
# ST49Q05: Study More Than 2 Extra Hours a Day, ST49Q06: Play Chess,
# ST49Q07: Computer programming, ST49Q09: Participate in Math Club
behavior = [
    'ST49Q01', 'ST49Q02', 'ST49Q03', 'ST49Q04',
    'ST49Q05', 'ST49Q06', 'ST49Q07', 'ST49Q09',
]

# ST35Q04: Parents Believe Studying Mathematics Is Important
# ST35Q05: Parents Believe Mathematics Is Important for Career
# ST35Q06: Parents Like Mathematics
parents = ['ST35Q04', 'ST35Q05', 'ST35Q06']

# All groups, in a fixed order.
features = [
    student_info, scores, motivation, anxiety, self,
    interest, work_ethic, behavior, parents,
]
# Printable labels, index-aligned with `features`. Labels carry no
# surrounding whitespace so they print and compare cleanly.
str_features = [
    'student_info', 'scores', 'motivation', 'anxiety', 'self',
    'interest', 'work_ethic', 'behavior', 'parents',
]
print(str_features)
# Frequency table of every selected column, printed group by group.
for group_label, group_cols in zip(str_features, features):
    print(group_label)
    for column in group_cols:
        print(df[column].value_counts(), '\n')

# Per group: number of rows with at least one missing answer, then the
# fraction of all rows that this represents.
for group_label, group_cols in zip(str_features, features):
    row_has_gap = df[group_cols].isnull().any(axis=1)
    print(group_label, '\n')
    print(row_has_gap.sum())
    print(row_has_gap.mean(), '\n')

# Number of rows per country label.
df['CNT'].value_counts()
# The six attitude groups and their printable labels (index-aligned).
attitudes = [motivation, anxiety, self, interest, work_ethic, behavior]
attitudes_str = [
    'motivation', 'anxiety', 'self', 'interest', 'work_ethic', 'behavior',
]

# First rows of every feature group.
for group_label, group_cols in zip(str_features, features):
    print(group_label)
    print(df[group_cols].head(), '\n')

# Answer frequencies for each attitude question.
for group_label, group_cols in zip(attitudes_str, attitudes):
    print(group_label)
    for column in group_cols:
        print(df[column].value_counts(), '\n')

# Count of fully duplicated rows in the raw data.
df.duplicated().sum()
# Columns carried into the cleaned dataset.
keep = [
    'CNT', 'ST04Q01', 'PV1MATH',
    'ST29Q02', 'ST29Q05', 'ST29Q07', 'ST29Q08',
    'ST42Q01', 'ST42Q03', 'ST42Q05', 'ST42Q08', 'ST42Q10',
    'ST42Q02', 'ST42Q04', 'ST42Q06', 'ST42Q07', 'ST42Q09',
    'ST29Q01', 'ST29Q03', 'ST29Q04', 'ST29Q06',
    'ST46Q01', 'ST46Q02', 'ST46Q03', 'ST46Q04', 'ST46Q05',
    'ST46Q06', 'ST46Q07', 'ST46Q08', 'ST46Q09',
    'ST49Q01', 'ST49Q02', 'ST49Q03', 'ST49Q04',
    'ST49Q05', 'ST49Q06', 'ST49Q07', 'ST49Q09',
    'ST35Q04', 'ST35Q05', 'ST35Q06',
]
# Select the needed columns first rather than copying the whole frame and
# then subsetting it. dropna() returns a new frame containing only rows with
# an answer in every kept column, so later column assignments on df_new do
# not act on a selection of df.
df_new = df[keep].dropna()
df_new.shape
# Numeric codes assigned to the textual survey answers.
# points_1: agreement scale, stronger agreement -> larger number.
points_1 = {
    'Strongly agree': 4,
    'Agree': 3,
    'Disagree': 2,
    'Strongly disagree': 1,
}
# points_2: frequency scale, more frequent -> larger number.
points_2 = {
    'Always or almost always': 4,
    'Often': 3,
    'Sometimes': 2,
    'Never or rarely': 1,
}
# points_3: the agreement scale reversed (1 <-> 4, 2 <-> 3).
points_3 = {answer: 5 - value for answer, value in points_1.items()}
def com_features(features, comb_value, value_point, frame=None):
    """Recode a group of answer columns and add their per-row mean.

    Every column named in ``features`` is replaced by its values mapped
    through ``value_point``. A new column ``comb_value`` then holds, for
    each row, the mean of those recoded columns.

    Args:
        features: Column names that make up one question group.
        comb_value: Name of the composite column to create.
        value_point: Mapping from answer text to numeric score. Answers
            that are not keys of the mapping become NaN.
        frame: DataFrame to modify in place. When omitted, the module-level
            ``df_new`` is used, so three-argument calls behave as before.

    Returns:
        None. The frame is modified in place.
    """
    if frame is None:
        frame = df_new
    for feature in features:
        # Plain column assignment replaces the column, so the recoded values
        # get their own numeric dtype instead of being written into the
        # existing text column.
        frame[feature] = frame[feature].map(value_point)
    # skipna=False: a row with any missing/unmapped answer gets a NaN
    # composite rather than a mean over fewer questions.
    frame[comb_value] = frame[list(features)].mean(axis=1, skipna=False)
# One entry per composite score: (source columns, new column name, scale).
# The order below is the order in which the composite columns are created.
composite_specs = [
    (motivation, 'motivation', points_1),     # Motivation
    (anxiety, 'anxiety', points_1),           # Anxiety
    (interest, 'interest', points_1),         # Math interest
    (parents, 'parents', points_1),           # Parents' attitude
    (work_ethic, 'work_ethic', points_1),     # Math work ethic
    (behavior, 'behavior', points_2),         # Behavior (frequency scale)
    # NOTE(review): points_3 reverses the scale for every self-concept item,
    # yet per the column comments only ST42Q02 ("Not Good at Maths") is
    # negatively worded -- confirm the intended direction of self_concept.
    (self, 'self_concept', points_3),         # Self-concept
]
for group_cols, composite_name, scale in composite_specs:
    com_features(group_cols, composite_name, scale)
# Give the identifier and score columns readable names.
df_new.rename(
    columns={'CNT': 'country', 'ST04Q01': 'gender', 'PV1MATH': 'math_score'},
    inplace=True,
)
full_features = [
    'country', 'gender', 'math_score', 'motivation', 'anxiety',
    'interest', 'parents', 'work_ethic', 'behavior', 'self_concept',
]
# .copy() makes df_full an independent frame, so the edit below is
# guaranteed to land in df_full and cannot warn about writing to a
# selection of df_new.
df_full = df_new[full_features].copy()
# Rows labelled with individual US states are relabelled as the country.
states = ['Florida (USA)', 'Massachusetts (USA)', 'Connecticut (USA)']
# Assign the result back instead of calling replace(inplace=True) on
# df_full['country']: the in-place form operates on a temporary column
# object and can leave df_full itself unchanged.
df_full['country'] = df_full['country'].replace(states, 'United States of America')
df_full.to_csv('psia2012_cleaned.csv', index_label=False)
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
# Reload the cleaned export written earlier in this notebook.
# NOTE(review): the file was written with index_label=False, so its header
# presumably has one field fewer than the data rows and pandas takes the
# first column as the index -- confirm on the actual file.
cleaned_path = 'psia2012_cleaned.csv'
df_ = pd.read_csv(cleaned_path)
df_.head()
Make sure that, after every plot or related series of plots, you include a Markdown cell with comments about what you observed, and what you plan on investigating next.
# Number of students per gender, largest group first.
gender_counts = df_['gender'].value_counts()
gender_counts
# Derive the wedge labels from the counted categories so every wedge carries
# its own label whichever gender happens to be the larger group.
wedge_labels = [f'{gender}s' for gender in gender_counts.index]
plt.pie(gender_counts, colors=['r', 'b'], labels=wedge_labels)
plt.title('Students By Gender', size=15)
# Countries sorted from most to fewest students; `order` is also used by the
# gender-split count plot further down.
country_sizes = df_['country'].value_counts()
order = country_sizes.index
plt.figure(figsize=[20, 30])
sns.countplot(y='country', order=order, data=df_)
plt.title('Number of Students in Countries', size=25)
plt.show()
# Spread of the per-country sample sizes.
plt.figure(figsize=(20, 8))
# Pass the counts by keyword: the meaning of the first positional argument
# of sns.boxplot differs between seaborn versions, while x= always yields
# the horizontal box intended here.
sns.boxplot(x=df_['country'].value_counts(), color='gray')
plt.title('NUMBER OF STUDENTS BY COUNTRY', size=20)
# Summary statistics of the overall math score.
df_['math_score'].describe()
# Histogram with 30-point-wide bins; edges run 41, 71, ..., 971.
score_edges = np.arange(41, 924 + 50, 30)
plt.hist(df_['math_score'], bins=score_edges)
plt.title("Overall Math Score", size=20)
# One histogram per attitude composite on a 2 x 3 grid.
attitudes = ['motivation', 'anxiety', 'interest', 'work_ethic', 'behavior', 'self_concept']
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 12))
ax = ax.flatten()
# binsize = 1/number of questions per section, i.e. the smallest step by
# which a mean over that many integer answers can change.
binsizes = [1/4, 1/5, 1/4, 1/9, 1/8, 1/5]
for panel, feature, step in zip(ax, attitudes, binsizes):
    values = df_[feature]
    # Series.min()/max() run vectorised and skip NaN; the builtin min()/max()
    # iterate element by element and give order-dependent results when NaN
    # is present.
    bins = np.arange(values.min(), values.max() + step, step)
    panel.hist(data=df_, x=feature, bins=bins)
    panel.set_xlabel('Score')
    panel.set_ylabel('Number of Students')
    panel.set_title(feature)
    panel.grid()
# Histogram of the parents composite; bin width is one third because the
# composite is the mean of three questions.
binsize = 1 / 3
parent_scores = df_['parents']
bins = np.arange(parent_scores.min(), parent_scores.max() + binsize, binsize)
plt.hist(x='parents', bins=bins, data=df_)
plt.xlabel('score')
plt.ylabel('number of students')
plt.title("STUDENT'S PERCEPTION OF PARENTAL ATTITUDE TOWARDS MATHEMATICS")
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
# Students per country split by gender, countries in the order given by `order`.
plt.figure(figsize=(20, 30))
sns.countplot(
    y='country',
    hue='gender',
    order=order,
    palette='rocket',
    data=df_,
)
plt.legend(prop={'size': 15})
plt.title('Number of Student in Country', size=25)
plt.show()
# Count students per (country, gender) pair, then reshape to one row per
# country with a 'Female' and a 'Male' count column.
pair_counts = df_.groupby(['country', 'gender']).size()
pair_counts = pd.DataFrame(pair_counts).reset_index()
df_gender = pair_counts.pivot(index='country', columns='gender', values=0).reset_index()

# Totals, percentage shares, and the absolute gap between the two shares.
female_n = df_gender['Female']
male_n = df_gender['Male']
df_gender['total'] = female_n + male_n
df_gender['mpercent'] = 100 * male_n / df_gender['total']
df_gender['fpercent'] = 100 * female_n / df_gender['total']
df_gender['difference'] = (df_gender['fpercent'] - df_gender['mpercent']).abs()

# Box plot of the gap with the individual countries overlaid as points.
sns.boxplot(x='difference', color='pink', data=df_gender)
sns.swarmplot(x='difference', color='crimson', data=df_gender)
plt.title('Percent Difference in Gender by Country', size=20)

# Countries whose gap exceeds 12.3 percentage points.
df_gender[df_gender['difference'] > 12.3]
# The ten countries with the most students.
df_gender.sort_values(by='total', ascending=False).head(10)
# Overlaid math-score histograms, one per gender.
# Use one set of 50 bin edges computed over all scores: with a bare bin
# count each gender would get edges from its own score range, and the
# overlaid bars would not line up.
score_bins = np.histogram_bin_edges(df_['math_score'].dropna(), bins=50)
df_.groupby('gender')['math_score'].plot(kind='hist', alpha=0.5, bins=score_bins)
plt.legend()
plt.title('Math Score Distribution', size=20)
# Horizontal box plot of the parents composite for each gender.
sns.boxplot(x='parents', y='gender', orient="h", data=df_)
plt.title("Parents' effect with Gender", size=15)
Both genders report a strong parental influence, though the male and female distributions differ slightly (visible at the third quartile).
# Attitudes: overlaid female/male histograms for each composite, 2 x 3 grid.
attitude = ['motivation', 'anxiety', 'interest', 'work_ethic', 'behavior', 'self_concept']
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=[20, 12])
# binsizes = 1/(number of questions per section)
binsizes = [1/4, 1/5, 1/4, 1/9, 1/8, 1/5]
ax = ax.flatten()
# Split by gender once instead of re-filtering inside every iteration.
females = df_.query('gender=="Female"')
males = df_.query('gender=="Male"')
for panel, feature, step in zip(ax, attitude, binsizes):
    values = df_[feature]
    # Bin edges come from the full column so both genders share them.
    # Series.min()/max() are vectorised and skip NaN, unlike the builtins.
    bins = np.arange(values.min(), values.max() + step, step)
    panel.hist(data=females, x=feature, bins=bins, label='Female', alpha=.4)
    panel.hist(data=males, x=feature, bins=bins, label='Male', alpha=.4)
    panel.set_xlabel('Score')
    panel.set_ylabel('Number of Students')
    panel.set_title(feature)
    panel.legend()
    panel.grid()
# Check if there are any correlations between the features.
plt.figure(figsize=(10, 8))
# Correlate numeric columns only: df_ also holds the text columns 'country'
# and 'gender', and recent pandas raises on those instead of silently
# dropping them.
numeric_cols = df_.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(), cmap="RdBu_r", annot=True, vmin=-1, vmax=1)
plt.xticks(rotation=15)
plt.title("Correlation Plot", size=12)
plt.show()
We can see that there is a moderate positive correlation between motivation and each of interest, parents, work_ethic and behavior, and a moderate negative correlation between self_concept and each of interest, work_ethic and behavior.
Create plots of three or more variables to investigate your data even further. Make sure that your investigations are justified, and follow from your work in the previous sections.
# Per-country density of the motivation composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'motivation', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
Because most countries do not have an even 50/50 split between genders, I plotted the motivation scores as densities rather than counts.
Almost all countries show the male students outweighing female students in motivation scores.
# Per-country density of the anxiety composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'anxiety', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
Almost all countries show the female students outweighing male students in anxiety scores.
# Per-country density of the self_concept composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'self_concept', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
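Almost all countries show the female students outweighing male students in self_concept scores. Note that self_concept was scored on the reversed scale (Strongly agree = 1), so for the positively worded items a higher value indicates weaker agreement.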
# Per-country density of the interest composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'interest', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
Almost all countries show the male students outweighing female students in interest scores.
# Per-country density of the work_ethic composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'work_ethic', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
In almost all countries, male and female students have about the same work_ethic scores.
# Per-country density of the behavior composite, one curve per gender.
a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
# `bw` is deprecated in seaborn's kdeplot; `bw_method` is its replacement.
a.map(sns.kdeplot, 'behavior', bw_method=1/4)
# Give every facet its own legend and a dashed grid.
for ax in a.axes.flat:
    ax.legend()
    ax.grid(color='pink', linestyle='--')
Almost all countries show the male students outweighing female students in behavior scores.
At the end of your report, make sure that you export the notebook as an html file from the
File > Download as... > HTML menu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!
# For each attitude composite: a per-country facet grid of math score
# against that composite, one line per gender. One loop replaces six
# copies of the same stanza; the grids are drawn in the listed order.
for attitude_col in ('motivation', 'anxiety', 'self_concept',
                     'interest', 'work_ethic', 'behavior'):
    a = sns.FacetGrid(data=df_, col='country', col_wrap=4, hue='gender')
    # ci=False draws the lines without a confidence band.
    # NOTE(review): newer seaborn releases prefer errorbar=None over ci.
    a.map(sns.lineplot, attitude_col, 'math_score', ci=False)
    # Give every facet its own legend and a dashed grid.
    for ax in a.axes.flat:
        ax.legend()
        ax.grid(color='pink', linestyle='--')
Both genders have lower math scores at high behavior scores.
# Math score against each attitude composite, split by gender;
# one figure per composite, shown in this order.
attitude_cols = [
    'motivation', 'anxiety', 'self_concept',
    'interest', 'work_ethic', 'behavior',
]
for attitude_col in attitude_cols:
    sns.pointplot(x=attitude_col, y='math_score', hue='gender', data=df_)
    plt.show()